import altair as alt
import pandas as pd
from vega_datasets import data
# import vega
import geopandas as gpd
# Location of the crime dataset CSV, relative to the working directory.
path = 'datasets/crimedata.csv'
# Read with pandas defaults; as the preview shows, the file's unnamed first
# column is loaded as a regular column called "Unnamed: 0".
crime_data = pd.read_csv(path)
# Bare expression: previews the first rows when run interactively.
crime_data.head()
| Unnamed: 0 | Neighbourhood | Total - Age groups and average age of the population - 100% data | 0 to 14 years...3 | 0 to 4 years...4 | 5 to 9 years...5 | 10 to 14 years...6 | 15 to 64 years...7 | 15 to 19 years...8 | 20 to 24 years...9 | ... | MONTH | DAY | HOUR | MINUTE | HUNDRED_BLOCK | X | Y | Population density | Average cost of house in neighbour | Average income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Arbutus-Ridge | 15295.0 | 2015.0 | 455.0 | 685.0 | 880.0 | 9805.0 | 1230.0 | 1165.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2 | Downtown | 62030.0 | 4000.0 | 2080.0 | 1105.0 | 810.0 | 51275.0 | 1180.0 | 4050.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 3 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 11.0 | 15.0 | 14.0 | 30.0 | 29XX W 31ST AVE | 487516.1816 | 5454623.638 | NaN | NaN | NaN |
| 3 | 4 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 7.0 | 11.0 | 21.0 | 0.0 | 29XX W 31ST AVE | 487579.6067 | 5454613.684 | NaN | NaN | NaN |
| 4 | 5 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 4.0 | 25.0 | 21.0 | 54.0 | 29XX W 33RD AVE | 487585.2638 | 5454405.082 | NaN | NaN | NaN |
5 rows × 269 columns
# Build the per-(year, month) record counts used by the linked line/bar charts.
# Discard year 2023 as it does not have complete data (current year).
# Rows with a missing YEAR are dropped as well, because NaN < 2023 is False.
df = crime_data[crime_data['YEAR'] < 2023]
# One row per (YEAR, MONTH) holding the number of records in that group.
# size() counts rows directly, so the result carries only the three columns
# the charts read instead of a per-column non-null count of every column.
df_ym = df.groupby(['YEAR', 'MONTH']).size().reset_index(name='Count')
# Replace the month number with its abbreviated name (e.g. 1 -> 'Jan').
df_ym["MONTH"] = df_ym['MONTH'].apply(lambda x: pd.Timestamp(year=2000, month=int(x), day=1).strftime('%b'))
df_ym.head()
| YEAR | MONTH | Count | Neighbourhood | Total - Age groups and average age of the population - 100% data | 0 to 14 years...3 | 0 to 4 years...4 | 5 to 9 years...5 | 10 to 14 years...6 | 15 to 64 years...7 | ... | TYPE | DAY | HOUR | MINUTE | HUNDRED_BLOCK | X | Y | Population density | Average cost of house in neighbour | Average income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2003.0 | Jan | 5043 | 5042 | 3740 | 3740 | 3740 | 3740 | 3740 | 3740 | ... | 5043 | 5043 | 5043 | 5043 | 5043 | 5042 | 5042 | 0 | 0 | 0 |
| 1 | 2003.0 | Feb | 4250 | 4250 | 3283 | 3283 | 3283 | 3283 | 3283 | 3283 | ... | 4250 | 4250 | 4250 | 4250 | 4250 | 4250 | 4250 | 0 | 0 | 0 |
| 2 | 2003.0 | Mar | 4665 | 4658 | 3583 | 3583 | 3583 | 3583 | 3583 | 3583 | ... | 4665 | 4665 | 4665 | 4665 | 4665 | 4658 | 4658 | 0 | 0 | 0 |
| 3 | 2003.0 | Apr | 4895 | 4894 | 3714 | 3714 | 3714 | 3714 | 3714 | 3714 | ... | 4895 | 4895 | 4895 | 4895 | 4895 | 4894 | 4894 | 0 | 0 | 0 |
| 4 | 2003.0 | May | 5439 | 5431 | 4057 | 4057 | 4057 | 4057 | 4057 | 4057 | ... | 5439 | 5439 | 5439 | 5439 | 5438 | 5431 | 5431 | 0 | 0 | 0 |
5 rows × 269 columns
# Linked view: selecting a year on the line chart filters the monthly bar chart.
# Single-value selection keyed on the YEAR field.
selector = alt.selection_single(fields=['YEAR'])

# Shared base chart: both views inherit the data, the size and the selection,
# so the selection is registered here once and not re-added per view.
base = alt.Chart(df_ym).properties(
    width=350,
    height=300,
).add_selection(selector)

# Line chart: total records per year; years outside the selection are dimmed.
lines = base.mark_line(point=True, color='orange').encode(
    x=alt.X('YEAR:O', title="Year"),
    y=alt.Y('sum(Count):Q', title="Number of Crimes"),
    opacity=alt.condition(selector, alt.value(1), alt.value(0.25)),
    tooltip=[
        alt.Tooltip('YEAR'),
        alt.Tooltip('sum(Count)'),
    ],
)

# Bar chart: records per month, restricted to the selected year and ordered
# by descending bar height.
bar = base.mark_bar(opacity=1).encode(
    x=alt.X('MONTH:O', axis=alt.Axis(labelAngle=-30), title="Month", sort="-y"),
    y=alt.Y('sum(Count)', title="Crime by Month"),
    tooltip=[
        alt.Tooltip('MONTH:O', title="Month"),
        alt.Tooltip('sum(Count)', title='Incidents of Crime'),
    ],
).transform_filter(
    selector
)

# Place the two views side by side; configure_point sizes the line's markers.
complete = (lines | bar).properties(
    title="Trend of Crime in Vancouver Linked to Crime by Month"
).configure_point(
    size=75
)
complete
# Used https://altair-viz.github.io/gallery/scatter_with_layered_histogram.html as a resource
# Look at subsection of neighbourhoods in Downtown
neighbourhoods = ['Strathcona', 'Grandview-Woodland', 'Hastings-Sunrise', 'Mount Pleasant', 'Fairview']
# Combine different TYPEs into a couple similar types
type_groups = {
    'Break and Enter Commercial': 'Break And Enter',
    'Break and Enter Residential/Other': 'Break And Enter',
    'Homicide': 'Offence Against a Person',
    'Theft from Vehicle': 'Vehicle Related Theft',
    'Theft of Vehicle': 'Vehicle Related Theft',
    'Theft of Bicycle': 'Vehicle Related Theft',
    'Vehicle Collision or Pedestrian Struck (with Fatality)': 'Traffic Accident',
    'Vehicle Collision or Pedestrian Struck (with Injury)': 'Traffic Accident',
}
# Discard year 2023 as it does not have complete data (current year) and keep
# only the chosen neighbourhoods. Filtering first means the recode below only
# touches the rows that are actually charted.
keep = (crime_data['YEAR'] < 2023) & crime_data['Neighbourhood'].isin(neighbourhoods)
df2 = crime_data.loc[keep].copy()
# Recode the TYPE column in one pass instead of scanning every column of the
# frame once per label.
# NOTE(review): assumes these labels occur only in TYPE - confirm.
df2['TYPE'] = df2['TYPE'].replace(type_groups)
# Second Vis:
# One row per neighbourhood / crime type (with that neighbourhood's mean age
# and total population) and the number of records in the group.
df = (
    df2.groupby(['Neighbourhood', 'TYPE', 'Average age of the population',
                 "Total - Age groups and average age of the population - 100% data"])
    .size()
    .reset_index(name='Count')
)
df.head()
# Linked view: selecting a neighbourhood highlights it in both charts.
# Single-value selection keyed on the Neighbourhood field.
selector = alt.selection_single(fields=['Neighbourhood'])

# Shared base chart carrying the data, the size and the selection.
base = alt.Chart(df).properties(
    width=300,
    height=250,
).add_selection(selector)

# Scatter plot: average age against total population, coloured by
# neighbourhood; unselected neighbourhoods are faded.
points = base.mark_circle(size=200).encode(
    x=alt.X('Average age of the population',
            scale=alt.Scale(domain=[30, 55]),
            title="Average Age of the Population"),
    y=alt.Y('Total - Age groups and average age of the population - 100% data',
            scale=alt.Scale(domain=[10000, 40000]),
            title="Total Population"),
    color='Neighbourhood:N',
    opacity=alt.condition(selector, alt.value(1), alt.value(0.1)),
    tooltip=[
        alt.Tooltip('Average age of the population'),
        alt.Tooltip('Neighbourhood:N'),
        alt.Tooltip('Total - Age groups and average age of the population - 100% data:N',
                    title='Total Population in the Neighbourhood'),
    ],
)

# Stacked bar chart: records per crime type, split by neighbourhood.
bar = base.mark_bar(opacity=1).encode(
    x=alt.X('TYPE:N', axis=alt.Axis(labelAngle=-30), title="Type of Crime"),
    y=alt.Y('sum(Count)', title="Incidents of Crime"),
    color='Neighbourhood:N',
    opacity=alt.condition(selector, alt.value(1), alt.value(0.25)),
    tooltip=[
        alt.Tooltip('TYPE', title="Type of Crime"),
        alt.Tooltip('Neighbourhood:N'),
        alt.Tooltip('sum(Count)', title='Incidents of Crime'),
    ],
)

# Combine the two visualizations side by side.
complete = (points | bar).properties(
    title="Age to Population Scatterplot Linked to Crime For Each Type Stacked Bar Chart by Neighbourhood (Downtown area)"
)
complete
import altair as alt
from vega_datasets import data
import requests
import json
# GeoJSON export of Vancouver's local-area boundaries (fetched over the network).
vancouver_url = 'https://opendata.vancouver.ca/explore/dataset/local-area-boundary/download/?format=geojson&timezone=America/Los_Angeles'

# Number of records per neighbourhood as columns ['Neighbourhood', 'count'].
# Naming the index and the value column explicitly avoids depending on how
# the installed pandas version names the value_counts() result.
# NOTE(review): this counts every row of crime_data (2023 and rows without
# crime fields included), and names are matched exactly, so spellings such as
# 'Arbutus-Ridge' / 'Arbutus Ridge' are counted separately - confirm intended.
df = (
    crime_data['Neighbourhood']
    .value_counts()
    .rename_axis('Neighbourhood')
    .reset_index(name='count')
)

# Attach the counts to the boundary polygons; the default inner merge keeps
# only areas whose name appears in both tables.
gdf = gpd.read_file(vancouver_url)
gdf = gdf.rename(columns={'name': 'Neighbourhood'})
gdf = gdf.merge(df, on='Neighbourhood')

neighbours = gdf['Neighbourhood'].unique()  # unique neighbourhood names
selectNeighbourhood = alt.selection_single(
    name='Select',                                 # name the selection 'Select'
    fields=['Neighbourhood'],                      # select on the Neighbourhood field
    init={'Neighbourhood': neighbours[0]},         # start on the first neighbourhood
    bind=alt.binding_select(options=list(neighbours))  # dropdown of neighbourhood names
)

# Compute centroids in a projected CRS, then convert them back to the
# boundary file's CRS so they line up with the polygons.
gdf_projected = gdf.to_crs("EPSG:32610")  # You can replace EPSG:32610 with an appropriate EPSG code for your area
gdf_projected["centroid"] = gdf_projected["geometry"].centroid
gdf_projected["centroid"] = gdf_projected["centroid"].to_crs(gdf.crs)
gdf["centroid_lng"] = gdf_projected["centroid"].x
gdf["centroid_lat"] = gdf_projected["centroid"].y

# NOTE: `data` rebinds the name imported from vega_datasets in this file.
data = alt.InlineData(values=gdf.__geo_interface__,  # geopandas to geojson
                      # root object type is "FeatureCollection" but we need its features
                      format=alt.DataFormat(property='features', type='json'))
data
# Choropleth of record counts per neighbourhood. Each GeoJSON feature keeps
# its attributes under `properties`, hence the `properties.` field prefix.
base = alt.Chart(data).mark_geoshape(
    stroke='black',
    strokeWidth=1
).add_selection(
    selectNeighbourhood
).encode(
    color=alt.Color("properties.count:Q", title='Incidents of Crime'),
    tooltip=[
        alt.Tooltip('properties.Neighbourhood:N', title='Neighbourhood'),
        alt.Tooltip('properties.count:Q', title='Incidents of Crime')
    ],
    # Emphasise the selected neighbourhood. Both branches are plain values,
    # so no extra keyword arguments are passed to the condition.
    opacity=alt.condition(selectNeighbourhood, alt.value(0.75), alt.value(0.25))
).transform_calculate(
    # Expose the nested name as a top-level field for the selection to match.
    Neighbourhood='datum.properties.Neighbourhood'
).project(
    type='identity', reflectY=True
)

# Text labels placed at each neighbourhood's centroid.
text_chart = alt.Chart(data).mark_text(
    align='center',
    baseline='middle',
    fontSize=10,
    fontWeight="bold",
    dy=-8  # Adjust the y-offset of the text labels if necessary
).encode(
    longitude='properties.centroid_lng:Q',
    latitude='properties.centroid_lat:Q',
    text='properties.mapid:N',  # Use the 'mapid' column for text
    tooltip=[
        alt.Tooltip('properties.Neighbourhood:N', title='Neighbourhood'),
        alt.Tooltip('properties.count:Q', title='Incidents of Crime')
    ]
)

# Layer the labels over the map and size/title the combined chart.
map_with_mapid = base + text_chart
map_with_mapid = map_with_mapid.properties(
    height=500,
    width=600,
    title="Distribution of Total Crime in Vancouver by Neighbourhood"
)
map_with_mapid
# Used https://altair-viz.github.io/gallery/multiline_tooltip.html as a resource
# Combine different TYPEs into a couple similar types
type_groups = {
    'Break and Enter Commercial': 'Break And Enter',
    'Break and Enter Residential/Other': 'Break And Enter',
    'Homicide': 'Offence Against a Person',
    'Theft from Vehicle': 'Vehicle Related Theft',
    'Theft of Vehicle': 'Vehicle Related Theft',
    'Theft of Bicycle': 'Vehicle Related Theft',
    'Vehicle Collision or Pedestrian Struck (with Fatality)': 'Traffic Accident',
    'Vehicle Collision or Pedestrian Struck (with Injury)': 'Traffic Accident',
}
# Discard year 2023 as it is not over. Rows with a missing YEAR are dropped
# too (NaN < 2023 is False), which makes the integer cast below safe.
complete_years = crime_data[crime_data['YEAR'] < 2023]
df = complete_years.assign(
    # Recode TYPE in a single pass rather than scanning every column of the
    # frame once per label.
    # NOTE(review): assumes these labels occur only in TYPE - confirm.
    TYPE=complete_years['TYPE'].replace(type_groups),
    # Change Year Column to Temporal. YEAR is stored as a float (2003.0), so
    # go through int -> '2003' instead of relying on how floats are
    # stringified when matched against '%Y'.
    YEAR=pd.to_datetime(complete_years['YEAR'].astype(int).astype(str), format='%Y'),
)
df
| Unnamed: 0 | Neighbourhood | Total - Age groups and average age of the population - 100% data | 0 to 14 years...3 | 0 to 4 years...4 | 5 to 9 years...5 | 10 to 14 years...6 | 15 to 64 years...7 | 15 to 19 years...8 | 20 to 24 years...9 | ... | MONTH | DAY | HOUR | MINUTE | HUNDRED_BLOCK | X | Y | Population density | Average cost of house in neighbour | Average income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2 | 3 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 11.0 | 15.0 | 14.0 | 30.0 | 29XX W 31ST AVE | 487516.1816 | 5454623.638 | NaN | NaN | NaN |
| 3 | 4 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 7.0 | 11.0 | 21.0 | 0.0 | 29XX W 31ST AVE | 487579.6067 | 5454613.684 | NaN | NaN | NaN |
| 4 | 5 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 4.0 | 25.0 | 21.0 | 54.0 | 29XX W 33RD AVE | 487585.2638 | 5454405.082 | NaN | NaN | NaN |
| 5 | 6 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 9.0 | 8.0 | 7.0 | 0.0 | 29XX W 33RD AVE | 487585.2638 | 5454405.082 | NaN | NaN | NaN |
| 6 | 7 | Dunbar-Southlands | 21425.0 | 3545.0 | 675.0 | 1225.0 | 1650.0 | 14215.0 | 1800.0 | 1740.0 | ... | 12.0 | 2.0 | 7.0 | 54.0 | 29XX W 38TH AVE | 487435.4586 | 5453876.477 | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 853943 | 853944 | Arbutus Ridge | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 11.0 | 9.0 | 10.0 | 23.0 | YEW ST / KING EDWARD AVE | 488652.0000 | 5455342.000 | NaN | NaN | NaN |
| 853944 | 853945 | Arbutus Ridge | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 9.0 | 24.0 | 12.0 | 31.0 | YEW ST / NANTON AVE | 488739.0000 | 5454999.000 | NaN | NaN | NaN |
| 853945 | 853946 | Arbutus Ridge | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 8.0 | 31.0 | 20.0 | 1.0 | YEW ST / W 33RD AVE | 488673.0000 | 5454386.000 | NaN | NaN | NaN |
| 853946 | 853947 | Arbutus Ridge | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 1.0 | 23.0 | 18.0 | 16.0 | YEW ST / W 37TH AVE | 488470.0000 | 5453964.000 | NaN | NaN | NaN |
| 853947 | 853948 | Arbutus Ridge | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 8.0 | 17.0 | 14.0 | 56.0 | YEW ST / W 39TH AVE | 488463.0000 | 5453756.000 | NaN | NaN | NaN |
848542 rows × 269 columns
# Number of records per (YEAR, TYPE). size() counts rows directly, so the
# result holds just YEAR, TYPE and Count rather than a non-null count for
# every column of the dataset.
df = df.groupby(['YEAR', 'TYPE']).size().reset_index(name='Count')
df
| YEAR | TYPE | Count | Neighbourhood | Total - Age groups and average age of the population - 100% data | 0 to 14 years...3 | 0 to 4 years...4 | 5 to 9 years...5 | 10 to 14 years...6 | 15 to 64 years...7 | ... | MONTH | DAY | HOUR | MINUTE | HUNDRED_BLOCK | X | Y | Population density | Average cost of house in neighbour | Average income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2003-01-01 | Break And Enter | 10081 | 10081 | 8710 | 8710 | 8710 | 8710 | 8710 | 8710 | ... | 10081 | 10081 | 10081 | 10081 | 10081 | 10081 | 10081 | 0 | 0 | 0 |
| 1 | 2003-01-01 | Mischief | 6387 | 6387 | 4955 | 4955 | 4955 | 4955 | 4955 | 4955 | ... | 6387 | 6387 | 6387 | 6387 | 6387 | 6387 | 6387 | 0 | 0 | 0 |
| 2 | 2003-01-01 | Offence Against a Person | 3531 | 3529 | 2654 | 2654 | 2654 | 2654 | 2654 | 2654 | ... | 3531 | 3531 | 3531 | 3531 | 3531 | 3531 | 3531 | 0 | 0 | 0 |
| 3 | 2003-01-01 | Other Theft | 11426 | 11426 | 7969 | 7969 | 7969 | 7969 | 7969 | 7969 | ... | 11426 | 11426 | 11426 | 11426 | 11424 | 11426 | 11426 | 0 | 0 | 0 |
| 4 | 2003-01-01 | Traffic Accident | 1881 | 1849 | 1568 | 1568 | 1568 | 1568 | 1568 | 1568 | ... | 1881 | 1881 | 1881 | 1881 | 1881 | 1849 | 1849 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 115 | 2022-01-01 | Mischief | 5604 | 5604 | 3354 | 3354 | 3354 | 3354 | 3354 | 3354 | ... | 5604 | 5604 | 5604 | 5604 | 5604 | 5604 | 5604 | 0 | 0 | 0 |
| 116 | 2022-01-01 | Offence Against a Person | 3884 | 3882 | 2424 | 2424 | 2424 | 2424 | 2424 | 2424 | ... | 3884 | 3884 | 3884 | 3884 | 3884 | 3884 | 3884 | 0 | 0 | 0 |
| 117 | 2022-01-01 | Other Theft | 10731 | 10731 | 7272 | 7272 | 7272 | 7272 | 7272 | 7272 | ... | 10731 | 10731 | 10731 | 10731 | 10731 | 10731 | 10731 | 0 | 0 | 0 |
| 118 | 2022-01-01 | Traffic Accident | 1031 | 1028 | 839 | 839 | 839 | 839 | 839 | 839 | ... | 1031 | 1031 | 1031 | 1031 | 1031 | 1030 | 1030 | 0 | 0 | 0 |
| 119 | 2022-01-01 | Vehicle Related Theft | 9689 | 9689 | 6738 | 6738 | 6738 | 6738 | 6738 | 6738 | ... | 9689 | 9689 | 9689 | 9689 | 9689 | 9689 | 9689 | 0 | 0 | 0 |
120 rows × 269 columns
# Create a selection that chooses the nearest point & selects based on x-value
nearest = alt.selection_single(nearest=True, on='mouseover',
                               fields=['YEAR'])

# The basic line: one line per crime type over time.
line = alt.Chart(df).mark_line().encode(
    alt.X('YEAR:T', title='Year'),
    y='Count',
    color='TYPE:N'
)

# Transparent selectors across the chart. This is what tells us the x-value
# of the cursor, so the selection is registered on this layer (and only here).
selectors = alt.Chart(df).mark_point().encode(
    x='YEAR:T',
    opacity=alt.value(0)
).add_selection(nearest)

# Draw points on the line, and highlight based on selection
points = line.mark_circle().encode(
    opacity=alt.condition(nearest, alt.value(1), alt.value(0))
)

# Draw text labels near the points, and highlight based on selection
text = line.mark_text(align='left', dx=5, dy=-5).encode(
    text=alt.condition(nearest, 'Count:Q', alt.value(' '))
)

# Draw a rule at the location of the selection
rules = alt.Chart(df).mark_rule(color='gray').encode(
    x='YEAR:T',
).transform_filter(
    nearest
)

# Put the five layers into a chart and bind the data
layers = alt.layer(
    line, selectors, points, text, rules
).properties(
    width=600, height=300,
    title="Total Crime in Vancouver Over Time by Crime Type"
)
layers